{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Constructing independent variable: Has certainty?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "from nltk.corpus import wordnet\n",
    "from nltk.corpus import wordnet as wn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import TweetTokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.corpus import stopwords\n",
    "stop_words = stopwords.words('english')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tag import pos_tag\n",
    "from nltk.stem.wordnet import WordNetLemmatizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re, string\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json \n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "import liwc\n",
    "parse, category_names = liwc.load_token_parser('LIWC2015_Dictionary.dic')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getSyns(list_of_words):\n",
    "    \"\"\"Expand a list of words with WordNet synonyms.\n",
    "\n",
    "    For every synset that ``wn.synsets`` returns for an input word, the\n",
    "    name of the synset's first lemma is collected. The input words\n",
    "    themselves are always part of the result.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    list_of_words : list of str\n",
    "        Words to look up.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of str\n",
    "        Unique words (inputs plus first-lemma names), sorted so the\n",
    "        result is the same on every kernel restart.\n",
    "    \"\"\"\n",
    "    # Seed with the inputs once instead of re-adding them on every iteration.\n",
    "    wset = set(list_of_words)\n",
    "    for word in list_of_words:\n",
    "        wset.update(syn.lemmas()[0].name() for syn in wn.synsets(word))\n",
    "    return sorted(wset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "def lemmatize_sentence(tokens):\n",
    "    \"\"\"Return the lemma of every token, in input order.\n",
    "\n",
    "    Tokens are POS-tagged first; tags starting with 'NN' are lemmatized as\n",
    "    nouns, tags starting with 'VB' as verbs, and everything else as\n",
    "    adjectives.\n",
    "    \"\"\"\n",
    "    wordnet_lemmatizer = WordNetLemmatizer()\n",
    "\n",
    "    def lemma_pos(tag_name):\n",
    "        # Collapse the tagger's tag into the three classes used here.\n",
    "        if tag_name.startswith('NN'):\n",
    "            return 'n'\n",
    "        if tag_name.startswith('VB'):\n",
    "            return 'v'\n",
    "        return 'a'\n",
    "\n",
    "    return [\n",
    "        wordnet_lemmatizer.lemmatize(token, lemma_pos(tag_name))\n",
    "        for token, tag_name in pos_tag(tokens)\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "def remove_noise(tweet_tokens, stop_words):\n",
    "    \"\"\"Clean and normalize the tokens of one tweet.\n",
    "\n",
    "    Each token has URLs and @-mentions stripped, is lemmatized according\n",
    "    to its POS tag (noun for 'NN*', verb for 'VB*', adjective otherwise)\n",
    "    and is lowercased. A token is dropped when it ends up empty, when it\n",
    "    is a substring of ``string.punctuation``, or when its lowercase form\n",
    "    is in ``stop_words``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    tweet_tokens : list of str\n",
    "        Tokens of a single tweet.\n",
    "    stop_words : collection of str\n",
    "        Words to discard; compared against the lowercased token.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of str\n",
    "        The surviving tokens, lowercased, in their original order.\n",
    "    \"\"\"\n",
    "    # Raw strings: '\\(' in a normal literal is an invalid escape sequence\n",
    "    # and raises a SyntaxWarning on Python 3.12+. The pattern is unchanged.\n",
    "    url_pattern = re.compile(\n",
    "        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\\(\\),]|'\n",
    "        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'\n",
    "    )\n",
    "    mention_pattern = re.compile(r'(@[A-Za-z0-9_]+)')\n",
    "\n",
    "    # Hoisted out of the loop: neither depends on the current token.\n",
    "    lemmatizer = WordNetLemmatizer()\n",
    "    stop_word_set = set(stop_words)\n",
    "\n",
    "    cleaned_tokens = []\n",
    "    for token, tag in pos_tag(tweet_tokens):\n",
    "        token = url_pattern.sub('', token)\n",
    "        token = mention_pattern.sub('', token)\n",
    "\n",
    "        if tag.startswith('NN'):\n",
    "            pos = 'n'\n",
    "        elif tag.startswith('VB'):\n",
    "            pos = 'v'\n",
    "        else:\n",
    "            pos = 'a'\n",
    "        token = lemmatizer.lemmatize(token, pos)\n",
    "\n",
    "        lowered = token.lower()\n",
    "        if token and token not in string.punctuation and lowered not in stop_word_set:\n",
    "            cleaned_tokens.append(lowered)\n",
    "    return cleaned_tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "def hasTargetTokenCategory(token, target_token_category):\n",
    "    \"\"\"Return True if ``parse(token)`` yields ``target_token_category``.\"\"\"\n",
    "    for category in parse(token):\n",
    "        if target_token_category == category:\n",
    "            return True\n",
    "    return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "def countPercentage(list_of_token, target_token_category): \n",
    "    counter = 0\n",
    "    for token in list_of_token:\n",
    "        if hasTargetTokenCategory(token, target_token_category):\n",
    "            counter+=1\n",
    "    percent = counter/len(list_of_token)\n",
    "    return percent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculateCertaintyScores(twts, stopwords, target_token_category):\n",
    "    \"\"\"Flag each tweet with whether it contains a token of a given category.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    twts : iterable of dict\n",
    "        Tweets; each needs an 'id' key and its text under 'text' or,\n",
    "        failing that, 'full_text'.\n",
    "    stopwords : list of str\n",
    "        Stop words handed to ``remove_noise``. Inside this function the\n",
    "        name shadows the ``nltk.corpus.stopwords`` import.\n",
    "    target_token_category : str\n",
    "        Category to look for; also used as the key of the score in each\n",
    "        result dict.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of dict\n",
    "        One dict per tweet, in input order, holding the tweet's 'id' and\n",
    "        a 0/1 score stored under ``target_token_category``.\n",
    "    \"\"\"\n",
    "    # One tokenizer is reused for every tweet instead of building one per tweet.\n",
    "    tknzr = TweetTokenizer()\n",
    "\n",
    "    result_twts = []\n",
    "    for twt in twts:\n",
    "        text_key = 'text' if 'text' in twt else 'full_text'\n",
    "        token_list = tknzr.tokenize(twt[text_key])\n",
    "        token_list_cleaned = remove_noise(token_list, stopwords)\n",
    "\n",
    "        # Nothing left after cleaning means nothing can match; skipping the\n",
    "        # call also avoids dividing by a token count of zero.\n",
    "        if token_list_cleaned:\n",
    "            percent = countPercentage(token_list_cleaned, target_token_category)\n",
    "        else:\n",
    "            percent = 0\n",
    "\n",
    "        # construct the binary variable\n",
    "        score = 1 if percent > 0 else 0\n",
    "        result_twts.append({'id': twt['id'], target_token_category: score})\n",
    "    return result_twts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ExcludeTwtKeywords(twts, target_keywords): \n",
    "    result_twts = list()\n",
    "    for twt in twts: \n",
    "        if sum([(k in target_keywords) for k in twt['keywords']]) < len(twt['keywords']):\n",
    "            result_twts.append(twt)\n",
    "    return result_twts\n",
    "            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read both tweet files. The encoding is stated explicitly so the result\n",
    "# does not depend on the platform's default text encoding.\n",
    "with open('anti_vax.json', encoding='utf-8') as f:\n",
    "    anti_twts = json.load(f)\n",
    "with open('pro_vax.json', encoding='utf-8') as f:\n",
    "    pro_twts = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keywords with low precision for identifying tweets that discuss data;\n",
    "# tweets whose keywords are all in this list are excluded.\n",
    "low_precision_keywords = ['plot','plots','viz','vis']\n",
    "anti_twts_cleaned = ExcludeTwtKeywords(anti_twts, low_precision_keywords)\n",
    "pro_twts_cleaned = ExcludeTwtKeywords(pro_twts, low_precision_keywords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Category name looked up for every token: 'certain'\n",
    "target_category = 'certain'\n",
    "anti_cert = calculateCertaintyScores(anti_twts_cleaned, stop_words, target_category)\n",
    "pro_cert = calculateCertaintyScores(pro_twts_cleaned, stop_words, target_category)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build one frame per group from the score records and label the group.\n",
    "# NOTE(review): df_anti / df_pro were not defined anywhere in this\n",
    "# notebook; building them from anti_cert / pro_cert is the presumed\n",
    "# intent - confirm.\n",
    "df_anti = pd.DataFrame(anti_cert).assign(type='anti')\n",
    "df_pro = pd.DataFrame(pro_cert).assign(type='pro')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows\n",
    "# the same way and keeps each frame's own index, as append did.\n",
    "pd.concat([df_anti, df_pro]).to_csv('results/certainty.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:base] *",
   "language": "python",
   "name": "conda-base-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
